# Copyright (c) HySoP 2011-2024
#
# This file is part of HySoP software.
# See "https://particle_methods.gricad-pages.univ-grenoble-alpes.fr/hysop-doc/"
# for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib, math, operator, hashlib
from contextlib import contextmanager
import sympy as sm
from abc import ABCMeta, abstractmethod
from hysop import __VERBOSE__, __KERNEL_DEBUG__
from hysop.core.arrays.all import OpenClArray
from hysop.tools.contexts import nested
from hysop.constants import (
DirectionLabels,
BoundaryCondition,
Backend,
Precision,
SymbolicExpressionKind,
)
from hysop.tools.misc import Utils, upper_pow2_or_3
from hysop.tools.htypes import check_instance, first_not_None, to_tuple
from hysop.tools.numpywrappers import npw
from hysop.tools.numerics import is_complex
from hysop.fields.cartesian_discrete_field import CartesianDiscreteField
from hysop.numerics.remesh.remesh import RemeshKernel
from hysop.fields.continuous_field import Field
from hysop.fields.discrete_field import DiscreteScalarFieldView
from hysop.symbolic import space_symbols as symbolic_space_symbols
from hysop.symbolic import local_indices_symbols as symbolic_local_indices
from hysop.symbolic.array import (
OpenClSymbolicArray,
OpenClSymbolicBuffer,
OpenClSymbolicNdBuffer,
)
from hysop.backend.device.opencl import cl, clTools, clCharacterize
from hysop.backend.device.opencl.opencl_env import OpenClEnvironment
from hysop.backend.device.opencl.opencl_types import OpenClTypeGen
from hysop.backend.device.opencl.opencl_array_backend import OpenClArrayBackend
from hysop.backend.device.codegen import CodeGeneratorWarning
from hysop.backend.device.codegen.base.utils import WriteOnceDict, ArgDict, SortedDict
from hysop.backend.device.codegen.base.statistics import WorkStatistics
from hysop.backend.device.codegen.base.variables import CodegenStruct
from hysop.backend.device.codegen.base.opencl_codegen import OpenClCodeGenerator
from hysop.backend.device.codegen.base.kernel_codegen import KernelCodeGenerator
from hysop.backend.device.codegen.base.variables import (
CodegenVariable,
CodegenVectorClBuiltin,
CodegenArray,
ctype_to_dtype,
)
from hysop.backend.device.codegen.structs.mesh_info import (
MeshBaseStruct,
MeshInfoStruct,
)
from hysop.backend.device.codegen.symbolic.functions.custom_symbolic_function import (
CustomSymbolicFunction,
)
from hysop.operator.base.custom_symbolic_operator import ValidExpressions
from hysop.symbolic.field import SymbolicDiscreteField
from hysop.symbolic.relational import Assignment
from hysop.symbolic.misc import TimeIntegrate
from hysop.backend.device.codegen.symbolic.expr import (
OpenClAssignment,
OpenClVariable,
FunctionCall,
UpdateVars,
IfElse,
)
class SymbolicCodegenContext:
"""Store all information required to generate custom code."""
def __init__(
self,
typegen,
expr_info,
ftype,
itype,
vectorization,
granularity,
kernel_dim,
use_short_circuit,
work_dim,
known_vars,
tuning_mode,
debug_mode,
symbolic_mode,
):
vftype = typegen.vtype(ftype, vectorization)
vitype = typegen.vtype(itype, vectorization)
vgranularity_dim = upper_pow2_or_3(granularity)
gftype = typegen.vtype(ftype, vgranularity_dim)
gitype = typegen.vtype(itype, vgranularity_dim)
vkernel_dim = upper_pow2_or_3(kernel_dim)
kftype = typegen.vtype(ftype, vkernel_dim)
kitype = typegen.vtype(itype, vkernel_dim)
array_dim = kernel_dim + granularity
varray_dim = upper_pow2_or_3(array_dim)
aftype = typegen.vtype(ftype, varray_dim)
aitype = typegen.vtype(itype, varray_dim)
self.expr_info = expr_info
self.typegen = typegen
self.work_dim = work_dim
self.array_dim = array_dim
self.kernel_dim = kernel_dim
self.granularity = granularity
self.varray_dim = varray_dim
self.vkernel_dim = vkernel_dim
self.vgranularity_dim = vgranularity_dim
self.itype, self.ftype = itype, ftype
self.vitype, self.vftype = vitype, vftype
self.gitype, self.gftype = gitype, gftype
self.kitype, self.kftype = kitype, kftype
self.aitype, self.aftype = aitype, aftype
self.vectorization = vectorization
self.use_short_circuit = use_short_circuit
self.local_size_known = "local_size" in known_vars
self.tuning_mode = tuning_mode
self.debug_mode = debug_mode
self.symbolic_mode = symbolic_mode
self.known_vars = known_vars
self.array_sizes = SortedDict()
self.array_ghosts = SortedDict()
self.array_contiguous_ghosts = SortedDict()
self.buffer_args = SortedDict()
self.compute_work_per_step()
self.generate_args()
def compute_work_per_step(self):
expr_info = self.expr_info
nsteps = expr_info.nsteps
nlhsobjects = expr_info.nlhsobjects
extra_work_per_step = npw.int_zeros(shape=(nsteps, nlhsobjects))
min_ghosts_per_integration_step = expr_info.min_ghosts_per_integration_step
if nsteps > 1:
extra_work_per_step[0 : nsteps - 1] = min_ghosts_per_integration_step[:-1][
::-1
]
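        # Round the extra work up to whole vector lanes with a ceiling
        # division: extra_vwork = ceil(extra_work / vectorization), computed
        # as (extra_work + vectorization - 1) // vectorization.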
extra_vwork_per_step = extra_work_per_step + self.vectorization - 1
extra_vwork_per_step //= self.vectorization
if nlhsobjects > 0:
max_extra_vwork = npw.max(extra_vwork_per_step[0])
else:
max_extra_vwork = 0
self.extra_vwork_per_step = extra_vwork_per_step
self.max_extra_vwork = max_extra_vwork
def array_size(self, varname, index=None):
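        """Return the local array size of `varname` as C source: the static
        element count `Smem` when the workgroup size is known, else the
        symbolic expression `A*local_size+B`."""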
assert varname in self.array_sizes, self.array_sizes.keys()
sizes = self.array_sizes[varname]
if index is not None:
sizes = sizes[index]
(Smem, A, B) = sizes
if self.local_size_known:
assert A == B == 0, f"A={A}, B={B}"
assert Smem >= 0, f"Smem={Smem}"
return str(Smem)
else:
assert Smem == 0, f"Smem={Smem}"
return f"{A}*{self.local_size}+{B}"
def array_ghost(self, varname, index=None):
assert varname in self.array_ghosts, self.array_ghosts.keys()
ghosts = self.array_ghosts[varname]
if index is not None:
ghosts = ghosts[index]
return ghosts
def generate_args(self):
args = ArgDict()
self.common_args = self.generate_common_args()
self.field_args = self.generate_field_args()
self.array_args = self.generate_array_args()
self.param_args = self.generate_param_args()
self.scalar_args = self.generate_scalar_args()
args.update(self.common_args)
args.update(self.field_args)
args.update(self.array_args)
args.update(self.param_args)
args.update(self.scalar_args)
self.args = args
def generate_common_args(self):
tg = self.typegen
args = ArgDict()
args["offset"] = CodegenVariable("offset", self.itype, tg, const=True)
args["local_offset"] = CodegenVariable(
"local_offset", self.itype, tg, const=True
)
args["line_offset"] = CodegenVariable("line_offset", self.itype, tg, const=True)
args["full_offset"] = CodegenVariable("full_offset", self.itype, tg, const=True)
args["last_offset"] = CodegenVariable(
"last_offset", self.itype, tg, const=True, nl=True
)
args["is_first"] = CodegenVariable("is_first", "bool", tg, const=True)
args["is_last"] = CodegenVariable("is_last", "bool", tg, const=True)
args["is_active"] = CodegenVariable("is_active", "bool", tg, const=True)
args["is_first_active"] = CodegenVariable(
"is_first_active", "bool", tg, const=True, nl=True
)
args["is_last_active"] = CodegenVariable(
"is_last_active", "bool", tg, const=True, nl=True
)
args["is_active_boundary"] = CodegenVariable(
"is_active_boundary", "bool", tg, const=True, nl=True
)
args["lid"] = CodegenVariable("lid", self.itype, tg, const=True)
args["local_work"] = CodegenVariable("lwork", self.itype, tg, const=True)
args["current_local_work"] = CodegenVariable(
"clwork", self.itype, tg, const=True, nl=True
)
args["compute_grid_size"] = CodegenVectorClBuiltin(
"compute_grid_size",
self.itype,
self.varray_dim,
typegen=tg,
value=self.expr_info.compute_resolution[::-1],
const=True,
nl=True,
)
args["dx"] = CodegenVariable("dx", tg.fbtype, tg, const=True)
self.space_symbols = SortedDict()
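        # Only the first (contiguous) space coordinate is vectorized, holding
        # one position per SIMD lane; the remaining coordinates are scalars.
        # The local index symbols below follow the same pattern.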
for i, xi in enumerate(symbolic_space_symbols[: self.varray_dim]):
if i == 0:
args[xi.varname] = CodegenVectorClBuiltin(
xi.varname, self.ftype, self.vectorization, typegen=tg
)
else:
args[xi.varname] = CodegenVariable(xi.varname, self.ftype, tg)
self.space_symbols[xi] = args[xi.varname]
self.local_indices_symbols = SortedDict()
for i, Li in enumerate(symbolic_local_indices[: self.varray_dim]):
if i == 0:
args[Li.varname] = CodegenVectorClBuiltin(
Li.varname, self.itype, self.vectorization, typegen=tg
)
else:
args[Li.varname] = CodegenVariable(Li.varname, self.itype, tg)
self.local_indices_symbols[Li] = args[Li.varname]
if self.local_size_known:
local_size = self.known_vars["local_size"][0]
else:
local_size = CodegenVariable("L", self.itype, tg, const=True)
args["L"] = local_size
self.local_size = local_size
for argname, arg in args.items():
setattr(self, argname, arg)
return args
def generate_field_args(self):
typegen = self.typegen
expr_info = self.expr_info
min_ghosts = expr_info.min_ghosts_per_components
write_counter = expr_info.discretization_info.write_counter
read_counter = expr_info.discretization_info.read_counter
args = ArgDict()
array_ghosts = SortedDict()
array_contiguous_ghosts = SortedDict()
array_sizes = SortedDict()
dfields = {
f.dfield
for f in set(expr_info.input_dfields.values()).union(
expr_info.output_dfields.values()
)
}
for dfield in dfields:
field = dfield._field
ctype = dfield.ctype
name = dfield.var_name.lower()
if name == dfield.var_name:
name = "_" + name
name = f"{name}_{{}}"
reads = read_counter.get(dfield, None)
writes = write_counter.get(dfield, None)
local_size_per_index = array_sizes.setdefault(
dfield, npw.int_zeros(shape=(dfield.nb_components, 3))
)
for index in range(dfield.nb_components):
is_read = (reads is not None) and (reads[index] > 0)
is_written = (writes is not None) and (writes[index] > 0)
if not (is_read or is_written):
continue
ghosts = min_ghosts[field][index]
has_ghosts = ghosts > 0
cname = name.format(index)
array_ghosts[cname] = ghosts
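                # Components accessed with ghosts are passed as __local
                # pointers; ghost-free components are passed by value as a
                # single OpenCL vector.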
if has_ghosts:
args[cname] = CodegenVariable(
name=cname,
typegen=typegen,
ctype=ctype,
ptr=True,
ptr_restrict=True,
const=True,
storage="__local",
add_impl_const=True,
nl=True,
)
if self.local_size_known:
local_size_per_index[index] = (
self.vectorization * self.local_size + 2 * ghosts,
0,
0,
)
else:
local_size_per_index[index] = (
0,
self.vectorization,
2 * ghosts,
)
else:
args[cname] = CodegenVectorClBuiltin(
cname,
ctype,
self.vectorization,
typegen=typegen,
const=True,
nl=True,
)
array_contiguous_ghosts[dfield] = min_ghosts[field].copy()
array_ghosts.update(array_contiguous_ghosts)
self.array_sizes.update(array_sizes)
self.array_ghosts.update(array_ghosts)
self.array_contiguous_ghosts.update(array_contiguous_ghosts)
return args
def generate_array_args(self):
typegen = self.typegen
expr_info = self.expr_info
min_ghosts = expr_info.min_ghosts_per_components
write_counter = expr_info.discretization_info.write_counter
read_counter = expr_info.discretization_info.read_counter
args = ArgDict()
array_ghosts = SortedDict()
array_contiguous_ghosts = SortedDict()
array_sizes = SortedDict()
arrays = set(expr_info.input_arrays.values()).union(
expr_info.output_arrays.values()
)
for a in arrays:
ctype = a.ctype
name = a.varname.lower()
if name == a.varname:
name = "_" + name
reads = read_counter.get(a, 0)
writes = write_counter.get(a, 0)
ghosts = min_ghosts.setdefault(a, npw.asintarray([0]))
is_read = reads > 0
is_written = writes > 0
has_ghosts = ghosts > 0
assert is_read or is_written
local_size = 0
array_ghosts[name] = ghosts
if has_ghosts:
args[name] = CodegenVariable(
name=name,
typegen=typegen,
ctype=ctype,
ptr=True,
ptr_restrict=True,
const=True,
storage="__local",
add_impl_const=True,
nl=True,
)
if self.local_size_known:
local_size = (
self.vectorization * self.local_size + 2 * ghosts,
0,
0,
)
else:
local_size = (0, self.vectorization, 2 * ghosts)
else:
args[name] = CodegenVectorClBuiltin(
name,
ctype,
self.vectorization,
typegen=typegen,
const=True,
nl=True,
)
local_size = 0
array_sizes[a] = local_size
array_contiguous_ghosts[a] = min_ghosts[a].copy()
array_ghosts.update(array_contiguous_ghosts)
self.array_sizes.update(array_sizes)
self.array_ghosts.update(array_ghosts)
self.array_contiguous_ghosts.update(array_contiguous_ghosts)
return args
def generate_param_args(self):
typegen = self.typegen
expr_info = self.expr_info
args = ArgDict()
# READ ONLY PARAMETERS
# (ndim<=1) and (1<=size<=16) => simple vector constant
# (ndim>1) or (size>16) => ptr (const) __constant memory space
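        # Hypothetical examples: a float parameter of shape (3,) becomes a
        # 'const float3' vector argument, while one of shape (2, 5) becomes
        # a 'const __constant float*' pointer argument.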
for pname, param in expr_info.input_params.items():
assert pname not in expr_info.output_params
shape = param.shape
ctype = param.ctype
if (len(shape) == 0) or ((len(shape) == 1) and (shape[0] <= 16)):
vsize = upper_pow2_or_3(shape[0]) if (len(shape) == 1) else 1
arg = CodegenVectorClBuiltin(
pname, ctype, vsize, typegen=typegen, const=True, nl=True
)
else:
storage = "__constant"
arg = CodegenVariable(
name=pname,
typegen=typegen,
ctype=ctype,
ptr=True,
ptr_restrict=True,
const=True,
storage=storage,
add_impl_const=True,
nl=True,
)
args[pname] = arg
# OUTPUT PARAMETERS
# not supported yet (should be non const __global ptrs).
for pname, param in expr_info.output_params.items():
raise NotImplementedError("Output parameters are not supported.")
return args
def generate_scalar_args(self):
typegen = self.typegen
expr_info = self.expr_info
args = ArgDict()
for sname, scalar in expr_info.scalars.items():
ctype = scalar.ctype
vsize = self.vectorization
scalar = CodegenVectorClBuiltin(
sname, ctype, vsize, typegen=typegen, const=True, nl=True
)
args[sname] = scalar
return args
class CustomSymbolicKernelGenerator(KernelCodeGenerator, metaclass=ABCMeta):
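    """Abstract OpenCL kernel generator for custom symbolic expressions.
    Concrete generators are selected per expression kind via create()."""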
@classmethod
def create(cls, expr_info, **kwds):
"""Kernel generator factory that handles different expression types."""
if expr_info.kind == SymbolicExpressionKind.AFFECT:
from hysop.backend.device.codegen.symbolic.kernels.custom_symbolic_affect import (
CustomSymbolicAffectKernelGenerator,
)
return CustomSymbolicAffectKernelGenerator(expr_info=expr_info, **kwds)
elif expr_info.kind == SymbolicExpressionKind.TIME_INTEGRATE:
from hysop.backend.device.codegen.symbolic.kernels.custom_symbolic_time_integrate import (
CustomSymbolicTimeIntegrateKernelGenerator,
)
return CustomSymbolicTimeIntegrateKernelGenerator(
expr_info=expr_info, **kwds
)
else:
msg = "Expression kind {} is not supported yet."
msg = msg.format(expr_info.kind)
raise RuntimeError(msg)
@abstractmethod
def custom_name(cls):
pass
@abstractmethod
def generate_expr_code(self):
pass
@classmethod
def codegen_name(
cls,
work_dim,
array_dim,
kernel_dim,
granularity,
ftype,
vectorization,
name,
direction,
):
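        # Hypothetical example: name='advec', array_dim=3, kernel_dim=1,
        # work_dim=1, granularity=0, ftype='float' and vectorization=4
        # yield 'advec__3d_kdim1_wdim1_gr0__float_v4'.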
return "{}__{}d_kdim{}_wdim{}_gr{}__{}_v{}".format(
name, array_dim, kernel_dim, work_dim, granularity, ftype, vectorization
)
def __init__(
self,
typegen,
expr_info,
ftype,
kernel_dim,
work_dim,
granularity,
vectorization,
itype="int",
use_short_circuit=None,
symbolic_mode=False,
debug_mode=False,
tuning_mode=False,
known_vars=None,
):
assert vectorization in [1, 2, 4, 8, 16]
use_short_circuit = first_not_None(
use_short_circuit, typegen.use_short_circuit_ops
)
known_vars = first_not_None(known_vars, {})
csc = SymbolicCodegenContext(
typegen,
expr_info,
ftype,
itype,
vectorization,
granularity,
kernel_dim,
use_short_circuit,
work_dim,
known_vars,
tuning_mode,
debug_mode,
symbolic_mode,
)
name = self.codegen_name(
work_dim,
csc.array_dim,
csc.kernel_dim,
csc.granularity,
csc.ftype,
csc.vectorization,
expr_info.name,
expr_info.direction,
)
kernel_reqs = self.build_requirements(csc)
kernel_args = self.gen_kernel_arguments(csc, kernel_reqs)
expr_reqs = self.build_expr_requirements(
csc, kernel_reqs, kernel_args, known_vars
)
kernel_reqs.update(expr_reqs)
super().__init__(
name=name,
typegen=typegen,
work_dim=work_dim,
kernel_args=kernel_args,
known_vars=known_vars,
vec_type_hint=ftype,
symbolic_mode=symbolic_mode,
)
self.update_requirements(kernel_reqs)
self.csc = csc
self.gencode()
def build_requirements(self, csc):
typegen = csc.typegen
reqs = WriteOnceDict()
# discrete cartesian fields mesh info
mesh_base_struct = MeshBaseStruct(typegen=typegen, vsize=csc.varray_dim)
reqs["MeshBaseStruct"] = mesh_base_struct
mesh_info_struct = MeshInfoStruct(typegen=typegen, vsize=csc.varray_dim)
reqs["MeshInfoStruct"] = mesh_info_struct
return reqs
@abstractmethod
    def build_expr_requirements(self, csc, kernel_reqs, kernel_args, known_vars):
        """Build expression-specific requirements and generate the new expressions."""
return WriteOnceDict()
def required_workgroup_cache_size(self, local_work_size):
"""
Return a tuple of required (static,dynamic,total) cache bytes per workgroup
"""
work_dim = self.work_dim
local_mem_size = self.local_mem_size
local_work_size = npw.asarray(local_work_size)
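        # local_mem_size is a (static, A, B) triple: the static part is fixed
        # at code generation time while the dynamic part scales with the
        # workgroup size as A*local_work_size[0] + B.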
sc = local_mem_size[0]
dc = local_mem_size[1] * local_work_size[0] + local_mem_size[2]
tc = sc + dc
if dc > 0:
msg = "Dynamic cache has not been implemented yet, "
msg += "please specify local_work_size in known_vars."
raise NotImplementedError(msg)
return (sc, dc, tc)
def gen_kernel_arguments(self, csc, kernel_reqs):
expr_info = csc.expr_info
typegen = csc.typegen
kargs = ArgDict()
# declare all array like arguments
mesh_infos = SortedDict()
param_args = SortedDict()
array_args = SortedDict()
array_strides = SortedDict()
# read-only input fields
ei = expr_info
di = expr_info.discretization_info
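        # read_counter/write_counter map each accessed object to its access
        # counts: per-component arrays for discrete fields
        # (IndexedCounterTypes) and plain integers for symbolic arrays and
        # buffers (SimpleCounterTypes). Objects that are also written are
        # skipped here and emitted as non-const arguments in the write loop.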
for obj, counts in di.read_counter.items():
assert counts is not None
if npw.array_equal(counts, 0):
continue
if isinstance(obj, di.IndexedCounterTypes):
assert isinstance(obj, DiscreteScalarFieldView)
dfield = obj
args = array_args.setdefault(obj, {})
strides = array_strides.setdefault(obj, {})
mesh_info_name = f"{dfield.var_name}_mesh_info"
mesh_info = kernel_reqs["MeshInfoStruct"].build_codegen_variable(
const=True, name=mesh_info_name
)
assert dfield not in mesh_infos
mesh_infos[dfield] = mesh_info_name
kargs[mesh_info_name] = mesh_info
for i, count in enumerate(counts):
if count == 0:
continue
if (dfield in di.write_counter) and di.write_counter[dfield][i] > 0:
continue
vname = dfield.var_name + "_" + str(i)
volatile = vname in ei.is_volatile
(arg, stride) = OpenClArrayBackend.build_codegen_arguments(
kargs,
name=vname,
known_vars=csc.known_vars,
symbolic_mode=csc.symbolic_mode,
storage=self._global,
ctype=dfield.ctype,
typegen=typegen,
mesh_dim=csc.varray_dim,
ptr_restrict=True,
const=True,
volatile=volatile,
)
assert i not in args
assert i not in strides
args[i] = arg
strides[i] = stride
elif isinstance(obj, di.SimpleCounterTypes):
assert isinstance(
obj,
(OpenClSymbolicArray, OpenClSymbolicBuffer, OpenClSymbolicNdBuffer),
), type(obj)
assert counts > 0
if (obj in di.write_counter) and (di.write_counter[obj] > 0):
continue
vname = obj.varname
volatile = vname in ei.is_volatile
(arg, stride) = OpenClArrayBackend.build_codegen_arguments(
kargs,
name=vname,
known_vars=csc.known_vars,
symbolic_mode=csc.symbolic_mode,
storage=self._global,
ctype=obj.ctype,
typegen=typegen,
mesh_dim=csc.varray_dim,
ptr_restrict=True,
const=True,
volatile=volatile,
)
if isinstance(obj, (OpenClSymbolicBuffer, OpenClSymbolicNdBuffer)):
csc.buffer_args[obj] = arg
else:
array_args[obj] = {0: arg}
array_strides[obj] = {0: stride}
else:
msg = f"Unsupported type {type(obj)}."
raise TypeError(msg)
# output fields
for obj, counts in di.write_counter.items():
assert counts is not None
if npw.array_equal(counts, 0):
continue
if isinstance(obj, di.IndexedCounterTypes):
assert isinstance(obj, DiscreteScalarFieldView)
dfield = obj
args = array_args.setdefault(dfield, {})
strides = array_strides.setdefault(dfield, {})
if dfield not in mesh_infos:
mesh_info_name = f"{dfield.var_name}_mesh_info"
mesh_info = kernel_reqs["MeshInfoStruct"].build_codegen_variable(
const=True, name=mesh_info_name
)
mesh_infos[dfield] = mesh_info_name
kargs[mesh_info_name] = mesh_info
for i, count in enumerate(counts):
if count == 0:
continue
vname = dfield.var_name + "_" + str(i)
volatile = vname in ei.is_volatile
arg, arg_strides = OpenClArrayBackend.build_codegen_arguments(
kargs,
name=vname,
known_vars=csc.known_vars,
symbolic_mode=csc.symbolic_mode,
storage=self._global,
ctype=dfield.ctype,
typegen=typegen,
mesh_dim=csc.varray_dim,
ptr_restrict=True,
const=False,
volatile=volatile,
)
assert i not in args
assert i not in strides
args[i] = arg
strides[i] = arg_strides
elif isinstance(obj, di.SimpleCounterTypes):
assert isinstance(
obj,
(OpenClSymbolicArray, OpenClSymbolicBuffer, OpenClSymbolicNdBuffer),
), type(obj)
assert counts > 0
vname = obj.varname
volatile = vname in ei.is_volatile
(arg, stride) = OpenClArrayBackend.build_codegen_arguments(
kargs,
name=vname,
known_vars=csc.known_vars,
symbolic_mode=csc.symbolic_mode,
storage=self._global,
ctype=obj.ctype,
typegen=typegen,
mesh_dim=csc.varray_dim,
ptr_restrict=True,
const=False,
volatile=volatile,
)
if isinstance(obj, (OpenClSymbolicBuffer, OpenClSymbolicNdBuffer)):
csc.buffer_args[obj] = arg
else:
array_args[obj] = {0: arg}
array_strides[obj] = {0: stride}
else:
msg = f"Unsupported type {type(obj)}."
raise TypeError(msg)
# parameters
for argname, arg in csc.param_args.items():
param_args[argname] = arg
kargs[argname] = arg
# granularity
if csc.granularity > 0:
gidx = CodegenVectorClBuiltin(
"gidx", "int", csc.vgranularity_dim, typegen=typegen, const=True
)
kargs["gidx"] = gidx
else:
gidx = None
# cache
if not csc.local_size_known:
lmem = CodegenVariable(
storage=self._local,
ctype="uchar",
add_impl_const=True,
name="buffer",
ptr=True,
ptr_restrict=True,
typegen=typegen,
nl=False,
)
kargs["buffer"] = lmem
msg = "Cannot handle dynamic local memory yet, "
msg += "please specity local work group size as a known_vars."
raise NotImplementedError(msg)
else:
lmem = None
self.field_mesh_infos = mesh_infos
self.array_args = array_args
self.array_strides = array_strides
self.param_args = param_args
self.gidx = gidx
self.lmem = lmem
return kargs
def _generate_common_variables(self):
tg = self.typegen
csc = self.csc
itype = csc.itype
varray_dim = csc.varray_dim
vectorization = csc.vectorization
expr_info = csc.expr_info
local_size = self.vars["local_size"]
loop_id = CodegenVectorClBuiltin("vid", itype, varray_dim, typegen=tg)
vectorization_var = CodegenVariable(
"n", itype, tg, const=True, value=vectorization
)
local_work = csc.local_work
max_extra_vwork_var = CodegenVariable(
"extra_vwork", csc.itype, typegen=tg, const=True, value=csc.max_extra_vwork
)
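        # Effective work per line and workgroup is n*(L - 2*extra_vwork):
        # each of the L work-items handles n (vectorization) grid points,
        # with extra_vwork vectors apparently reserved at both ends for the
        # multi-step integration ghosts (see compute_work_per_step).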
local_work.init = "{}*({}-2*{})".format(
vectorization_var, local_size[0], max_extra_vwork_var
)
vzero = CodegenVectorClBuiltin(
"vzero",
itype,
vectorization,
typegen=tg,
const=True,
value=npw.zeros(vectorization),
)
voffset = CodegenVectorClBuiltin(
"voffset",
itype,
vectorization,
typegen=tg,
const=True,
value=npw.arange(vectorization),
)
azero = CodegenVectorClBuiltin(
"azero",
itype,
varray_dim,
typegen=tg,
const=True,
value=npw.zeros(varray_dim),
)
compute_grid_size = csc.compute_grid_size
self.loop_id = loop_id
self.vectorization_var = vectorization_var
self.local_work = local_work
self.vzero = vzero
self.voffset = voffset
self.azero = azero
self.max_extra_vwork_var = max_extra_vwork_var
return (
compute_grid_size,
loop_id,
vectorization_var,
max_extra_vwork_var,
local_work,
vzero,
voffset,
azero,
)
def _generate_mesh_variables(self):
field_mesh_infos = self.field_mesh_infos
if not field_mesh_infos:
declare_mesh_properties = False
xmin, dx, inv_dx = None, None, None
else:
declare_mesh_properties = True
mesh_info_0 = next(iter(field_mesh_infos.values()))
dx = mesh_info_0["dx"].alias("dx", const=True)
inv_dx = mesh_info_0["inv_dx"].alias("inv_dx", const=True)
xmin = mesh_info_0["local_mesh"]["xmin"].alias("xmin", const=True)
declare_mesh_properties = False
self.dx = dx
self.inv_dx = inv_dx
self.xmin = xmin
return declare_mesh_properties, xmin, dx, inv_dx
def _generate_array_variables(self):
array_args = self.array_args
field_mesh_infos = self.field_mesh_infos
tg = self.typegen
csc = self.csc
di = csc.expr_info.discretization_info
varray_dim = csc.varray_dim
vectorization = csc.vectorization
compute_grid_size = csc.compute_grid_size
itype = csc.itype
vzero = self.vzero
voffset = self.voffset
azero = self.azero
array_gids = SortedDict()
array_vids = SortedDict()
array_grid_ghosts = SortedDict()
array_grid_sizes = SortedDict()
array_line_data = SortedDict()
array_local_data = SortedDict()
array_local_rdata = SortedDict()
array_private_data = SortedDict()
array_values = SortedDict()
local_size_per_field = SortedDict()
local_mem_size = npw.int_zeros(shape=(3,))
has_private_loads, has_private_stores = False, False
has_local_loads, has_local_stores = False, False
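        # Each accessed array component uses one of three storage strategies:
        # a __private vector when it has no ghosts, a statically sized
        # __local array when the workgroup size is known, or a pointer into
        # the dynamic __local buffer otherwise (this last case is not
        # implemented yet and raises NotImplementedError below).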
for array, array_data in array_args.items():
if isinstance(array, OpenClSymbolicArray):
name = array.varname
elif isinstance(array, DiscreteScalarFieldView):
name = array.var_name
else:
name = array.name
vindex = CodegenVectorClBuiltin(
name + "_vid", itype, varray_dim, typegen=tg
)
write_counts = di.write_counter.get(array, None)
read_counts = di.read_counter.get(array, None)
grid_size_varname = name + "_grid_size"
ghosts_varname = name + "_ghosts"
if array in field_mesh_infos:
# array is a discrete cartesian field (with potentially some ghosts)
mesh_info = field_mesh_infos[array]
grid_size = mesh_info["local_mesh"]["resolution"].alias(
grid_size_varname
)
grid_ghosts = mesh_info["ghosts"].alias(ghosts_varname)
else:
# array is a numpy like array (without ghosts)
grid_size = compute_grid_size.alias(grid_size_varname)
grid_ghosts = azero.alias(ghosts_varname)
indexed_line_data = array_line_data.setdefault(array, {})
indexed_gid = array_gids.setdefault(array, {})
indexed_local_data = array_local_data.setdefault(array, {})
indexed_local_rdata = array_local_rdata.setdefault(array, {})
indexed_private_data = array_private_data.setdefault(array, {})
indexed_values = array_values.setdefault(array, {})
if isinstance(array, OpenClSymbolicArray):
array_ghosts = self.csc.array_contiguous_ghosts[array]
is_read = (read_counts is not None) and (read_counts > 0)
is_written = (write_counts is not None) and (write_counts > 0)
is_ro = is_read and not is_written
is_wo = is_written and not is_read
is_rw = is_read and is_written
gindex = CodegenVariable(f"{name}_gid", "ptrdiff_t", tg, const=True)
line_data = array_data[0].newvar(
f"line_{name}", init=f"{array_data[0]} $+ {gindex}"
)
valname = name.lower()
if valname == name:
valname = f"_{valname}"
ghosts = array_ghosts
if ghosts == 0:
var = CodegenVectorClBuiltin(
valname,
array.ctype,
vectorization,
typegen=tg,
storage="__private",
)
local_size_per_index = (0, 0, 0)
elif csc.local_size_known:
L = self.known_vars["local_size"]
S = csc.vectorization * L[0] + 2 * ghosts
var = CodegenArray(
valname,
dim=1,
ctype=array.ctype,
typegen=tg,
shape=(S,),
storage=self._local,
)
local_size_per_index = (S, 0, 0)
itemsize = array.dtype.itemsize
local_mem_size[0] += S * itemsize
if is_rw:
rvar = CodegenArray(
valname + "_r",
dim=1,
ctype=array.ctype,
typegen=tg,
shape=(2 * ghosts,),
storage=self._local,
)
local_mem_size[0] += 2 * ghosts * itemsize
else:
rvar = None
else:
init = "{} + {}*{} + {}".format(
self.lmem,
local_mem_size[1],
self.vars["local_size"][0],
local_mem_size[2],
)
                    var = CodegenVariable(
                        name=valname,
                        typegen=tg,
                        ctype=array.ctype,
                        ptr=True,
                        ptr_restrict=True,
                        const=False,
                        storage=self._local,
                        add_impl_const=True,
                        nl=True,
                        init=init,
                    )
local_size_per_index = (0, csc.vectorization, 2 * ghosts)
itemsize = array.dtype.itemsize
local_mem_size[1] += csc.vectorization * itemsize
local_mem_size[2] += 2 * ghosts * itemsize
if is_rw:
init = "{} + {}*{} + {}".format(
self.lmem,
local_mem_size[1],
self.vars["local_size"][0],
local_mem_size[2],
)
                        rvar = CodegenVariable(
                            name=valname + "_r",
                            typegen=tg,
                            ctype=array.ctype,
                            ptr=True,
                            ptr_restrict=True,
                            const=False,
                            storage=self._local,
                            add_impl_const=True,
                            nl=True,
                            init=init,
                        )
local_mem_size[2] += 2 * ghosts * itemsize
else:
rvar = None
msg = "Cannot handle offset to different types yet "
msg += "(need to consider alignment)."
raise NotImplementedError(msg)
indexed_gid[0] = gindex
indexed_line_data[0] = line_data
indexed_values[0] = var
if ghosts == 0:
indexed_private_data[0] = var
has_private_loads |= is_read
has_private_stores |= is_written
else:
indexed_local_data[0] = var
indexed_local_rdata[0] = rvar
has_local_loads |= is_read
has_local_stores |= is_written
elif isinstance(array, CartesianDiscreteField):
array_ghosts = self.csc.array_contiguous_ghosts[array]
local_size_per_index = local_size_per_field.setdefault(
array, npw.int_zeros(shape=(array.nb_components, 3))
)
for i, data in array_data.items():
is_read = (read_counts is not None) and (read_counts[i] > 0)
is_written = (write_counts is not None) and (write_counts[i] > 0)
is_ro = is_read and not is_written
is_wo = is_written and not is_read
is_rw = is_read and is_written
gindex = CodegenVariable(
f"{name}_{i}_gid", "ptrdiff_t", tg, const=True
)
line_data = data.newvar(
f"line_{name}_{i}", init=f"{data} $+ {gindex}"
)
valname = name.lower()
if valname == name:
valname = f"_{valname}"
valname += f"_{i}"
ghosts = array_ghosts[i]
if ghosts == 0:
var = CodegenVectorClBuiltin(
valname,
array.ctype,
vectorization,
typegen=tg,
storage="__private",
)
local_size_per_index[i] = (0, 0, 0)
elif csc.local_size_known:
L = self.known_vars["local_size"]
S = csc.vectorization * L[0] + 2 * ghosts
var = CodegenArray(
valname,
dim=1,
ctype=array.ctype,
typegen=tg,
shape=(S,),
storage=self._local,
)
local_size_per_index[i] = (S, 0, 0)
itemsize = array.dtype.itemsize
local_mem_size[0] += S * itemsize
if is_rw:
rvar = CodegenArray(
valname + "_r",
dim=1,
ctype=array.ctype,
typegen=tg,
shape=(2 * ghosts,),
storage=self._local,
)
local_mem_size[0] += 2 * ghosts * itemsize
else:
rvar = None
else:
init = "{} + {}*{} + {}".format(
self.lmem,
local_mem_size[1],
self.vars["local_size"][0],
local_mem_size[2],
)
                        var = CodegenVariable(
                            name=valname,
                            typegen=tg,
                            ctype=array.ctype,
                            ptr=True,
                            ptr_restrict=True,
                            const=False,
                            storage=self._local,
                            add_impl_const=True,
                            nl=True,
                            init=init,
                        )
local_size_per_index[i] = (0, csc.vectorization, 2 * ghosts)
itemsize = array.dtype.itemsize
local_mem_size[1] += csc.vectorization * itemsize
local_mem_size[2] += 2 * ghosts * itemsize
if is_rw:
init = "{} + {}*{} + {}".format(
self.lmem,
local_mem_size[1],
self.vars["local_size"][0],
local_mem_size[2],
)
                            rvar = CodegenVariable(
                                name=valname + "_r",
                                typegen=tg,
                                ctype=array.ctype,
                                ptr=True,
                                ptr_restrict=True,
                                const=False,
                                storage=self._local,
                                add_impl_const=True,
                                nl=True,
                                init=init,
                            )
local_mem_size[2] += 2 * ghosts * itemsize
else:
rvar = None
msg = "Cannot handle offset to different types yet "
msg += "(need to consider alignment)."
raise NotImplementedError(msg)
indexed_gid[i] = gindex
indexed_line_data[i] = line_data
indexed_values[i] = var
if ghosts == 0:
indexed_private_data[i] = var
has_private_loads |= is_read
has_private_stores |= is_written
else:
indexed_local_data[i] = var
indexed_local_rdata[i] = rvar
has_local_loads |= is_read
has_local_stores |= is_written
else:
msg = f"Unsupported array type {type(array)}."
raise TypeError(msg)
if not indexed_local_data:
array_local_data.pop(array)
if not indexed_local_rdata:
array_local_rdata.pop(array)
if not indexed_private_data:
array_private_data.pop(array)
array_vids[array] = vindex
array_grid_sizes[array] = grid_size
array_grid_ghosts[array] = grid_ghosts
self.array_vids = array_vids
self.array_gids = array_gids
self.array_line_data = array_line_data
self.array_grid_sizes = array_grid_sizes
self.array_grid_ghosts = array_grid_ghosts
self.array_values = array_values
self.array_local_data = array_local_data
self.array_local_rdata = array_local_rdata
self.array_private_data = array_private_data
self.local_size_per_field = local_size_per_field
self.local_mem_size = local_mem_size
self.has_private_loads = has_private_loads
self.has_private_stores = has_private_stores
self.has_local_loads = has_local_loads
self.has_local_stores = has_local_stores
return (
array_gids,
array_vids,
array_values,
array_grid_sizes,
array_grid_ghosts,
array_local_data,
array_local_rdata,
array_private_data,
)
def _generate_inner_loop_variables(self):
tg = self.typegen
csc = self.csc
itype = csc.itype
compute_grid_size = csc.compute_grid_size
vectorization = csc.vectorization
loop_id = self.loop_id
local_size = self.vars["local_size"]
local_work = self.local_work
current_local_work = csc.current_local_work
self.current_local_work = current_local_work
local_offset = csc.local_offset
line_offset = csc.line_offset
full_offset = csc.full_offset
last_offset = csc.last_offset
is_first = csc.is_first
is_last = csc.is_last
is_active = csc.is_active
is_first_active = csc.is_first_active
is_last_active = csc.is_last_active
is_active_boundary = csc.is_active_boundary
k = CodegenVariable("k", itype, tg)
kmax = CodegenVariable(
"kmax",
itype,
tg,
const=True,
init="(({}+{lwork}-1)/{lwork})".format(
compute_grid_size[0], lwork=local_work
),
)
self.k = k
self.kmax = kmax
return (
local_offset,
line_offset,
last_offset,
full_offset,
k,
kmax,
current_local_work,
is_first,
is_last,
is_active,
is_first_active,
is_last_active,
is_active_boundary,
)
def _generate_loop_context(self):
csc = self.csc
itype = csc.itype
array_dim = csc.array_dim
compute_grid_size = csc.compute_grid_size
kdim = csc.kernel_dim
wdim = csc.work_dim
granularity = csc.granularity
loop_id = self.loop_id
array_vids = self.array_vids
array_gids = self.array_gids
array_ghosts = self.array_grid_ghosts
array_strides = self.array_strides
local_id = self.vars["local_id"]
global_id = self.vars["global_id"]
local_size = self.vars["local_size"]
global_size = self.vars["global_size"]
local_work = self.local_work
vectorization_var = self.vectorization_var
gidx = self.gidx
(
local_offset,
line_offset,
last_offset,
full_offset,
k,
kmax,
current_local_work,
is_first,
is_last,
is_active,
is_first_active,
is_last_active,
is_active_boundary,
) = self._generate_inner_loop_variables()
if self.work_dim == 1:
kmax.declare(self)
last_offset.declare(
self,
init="{} - {}*({}-1)*{}".format(
compute_grid_size[0], vectorization_var, kmax, local_size[0]
),
)
if granularity > 0:
self.jumpline()
self.decl_vars(*tuple([loop_id] + list(array_vids.values())))
if self.field_mesh_infos:
x0 = csc.space_symbols[symbolic_space_symbols[0]]
self.decl_vars(x0)
if csc.array_dim > 1:
self.decl_vars(
*tuple(
csc.space_symbols[symbolic_space_symbols[i]]
for i in range(1, csc.array_dim)
)
)
i0 = csc.local_indices_symbols[symbolic_local_indices[0]]
self.decl_vars(i0)
if csc.array_dim > 1:
self.decl_vars(
*tuple(
csc.local_indices_symbols[symbolic_local_indices[i]]
for i in range(1, csc.array_dim)
)
)
if granularity > 0:
code = f"{loop_id[kdim:kdim+granularity]} = {gidx[:granularity]};"
self.append(code)
with self._align_() as al:
for array, array_vid in array_vids.items():
ghosts = array_ghosts[array]
code = "{} $= {} $+ {};".format(
array_vid[kdim : kdim + granularity],
loop_id[kdim : kdim + granularity],
ghosts[kdim : kdim + granularity],
)
al.append(code)
for i in range(kdim, kdim + granularity):
idx_i = csc.local_indices_symbols[symbolic_local_indices[i]]
code = idx_i.affect(self, init=loop_id[i])
if self.field_mesh_infos:
fmi = self.field_mesh_infos[next(iter(array_vids))]
for i in range(kdim, kdim + granularity):
xi = csc.space_symbols[symbolic_space_symbols[i]]
code = "{xi} = {x0} + {vid}*{dx};".format(
xi=xi,
vid=next(iter(array_vids.values()))[i],
voffset=self.voffset,
x0=fmi["local_mesh"]["xmin"][i],
dx=fmi["dx"][i],
)
self.append(code)
self.jumpline()
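        # Build one nested loop context per kernel dimension: dimensions i>0
        # are grid-strided over global_size[i], while the innermost dimension
        # (i==0) iterates k over kmax workgroup-sized chunks of a contiguous
        # line, declaring offsets, activity flags and line pointers per chunk.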
@contextlib.contextmanager
def work_iterate(i):
try:
if i > 0:
j0 = global_id[i] if (i < wdim) else "0"
gsize = global_size[i] if (i < wdim) else "1"
j = loop_id[i]
N = compute_grid_size[i]
decl = ""
unroll = False
else:
j0 = "0"
gsize = "1"
j = k
N = kmax
decl = f"{itype} "
unroll = not csc.tuning_mode
with self._for_(
"{decl}{j}={j0}; {j}<{N}; {j}+={gsize}".format(
decl=decl, j=j, j0=j0, gsize=gsize, N=N
),
unroll=unroll,
) as ctx:
if i > 0:
with self._align_() as al:
al.jumpline()
for vid, ghosts in zip(
array_vids.values(), array_ghosts.values()
):
al.append(
"{} $= {} $+ {};".format(
vid[i], loop_id[i], ghosts[i]
)
)
al.jumpline()
idx_i = csc.local_indices_symbols[symbolic_local_indices[i]]
code = idx_i.affect(self, init=loop_id[i])
if self.field_mesh_infos:
arr, vid = next(
iter(
filter(
lambda kv: kv[0] in self.field_mesh_infos,
array_vids.items(),
)
)
)
fmi = self.field_mesh_infos[arr]
xi = csc.space_symbols[symbolic_space_symbols[i]]
code = "{xi} = {x0} + {vid}*{dx};".format(
xi=xi,
vid=vid[i],
voffset=self.voffset,
x0=fmi["local_mesh"]["xmin"][i],
dx=fmi["dx"][i],
)
self.append(code)
if i == 1:
kmax.declare(self)
last_offset.declare(
self,
init="{} - {}*({}-1)*{}".format(
compute_grid_size[0],
vectorization_var,
kmax,
local_size[0],
),
)
elif i == 0:
with self._align_() as al:
line_offset.declare(
al,
align=True,
const=True,
init="{}*{}".format(k, local_work),
)
local_offset.declare(
al,
align=True,
const=True,
init="{}*({}-{})".format(
vectorization_var,
local_id[0],
self.max_extra_vwork_var,
),
)
full_offset.declare(
al,
align=True,
const=True,
init="{}+{}".format(line_offset, local_offset),
)
self.jumpline()
with self._align_() as al:
al.append(f"{loop_id[0]} $= {line_offset};")
for vid, ghosts in zip(
array_vids.values(), array_ghosts.values()
):
al.append(f"{vid[0]} $= {loop_id[0]} + {ghosts[0]};")
idx_i = csc.local_indices_symbols[symbolic_local_indices[0]]
idx_i.affect(
al, init=f"{full_offset}+{self.voffset}", align=True
)
if self.field_mesh_infos:
arr, vid = next(
iter(
filter(
lambda kv: kv[0] in self.field_mesh_infos,
array_vids.items(),
)
)
)
fmi = self.field_mesh_infos[arr]
xi = csc.space_symbols[symbolic_space_symbols[i]]
code = "{xi} = {x0} + convert_{vftype}({vid}+{voffset}+{lo})*{dx};"
code = code.format(
xi=xi,
vid=vid[i],
lo=local_offset,
voffset=self.voffset,
vftype=csc.vftype,
x0=fmi["local_mesh"]["xmin"][i],
dx=fmi["dx"][i],
)
self.append(code)
with self._align_() as al:
is_first.declare(al, align=True, init=f"({k}==0)")
is_last.declare(al, align=True, init=f"({k}=={kmax}-1)")
init = (
"({fo} >= -{n}*{evwork}) && ({fo} < {S}+{n}*{evwork})"
)
init = init.format(
fo=full_offset,
n=self.vectorization_var,
S=compute_grid_size[0],
evwork=self.max_extra_vwork_var,
)
is_active.declare(al, init=init, align=True)
init = "{} && ({} < 0)".format(
is_active,
full_offset,
vectorization_var,
compute_grid_size[0],
)
is_first_active.declare(al, init=init, align=True)
init = "{} && ({}+{} > {})".format(
is_active,
full_offset,
vectorization_var,
compute_grid_size[0],
)
is_last_active.declare(al, init=init, align=True)
init = f"({is_first_active} || {is_last_active})"
is_active_boundary.declare(al, init=init, align=True)
current_local_work.declare(
al,
align=True,
init="({} ? {} : {})".format(
is_last,
f"{compute_grid_size[0]} - {k}*{local_work}",
local_work,
),
)
self.jumpline()
if self.array_vids:
self.comment("Compute global offsets and line pointers")
with self._align_() as al:
for array, vid in array_vids.items():
gids = array_gids[array]
strides = array_strides[array]
for key, gid in gids.items():
stride = strides[key]
idot = " $+ ".join(
f"{vid[i]}*{stride[i]}"
for i in range(array_dim - 1, -1, -1)
)
gid.declare(al, init=idot, align=True)
self.jumpline()
self.decl_aligned_vars(
*tuple(
aij
for ai in self.array_line_data.values()
for aij in ai.values()
)
)
yield ctx
except:
raise
nested_loops = [work_iterate(i) for i in range(kdim - 1, -1, -1)]
return nested_loops
def gencode(self):
s = self
csc = s.csc
tg = s.typegen
expr_info = csc.expr_info
ftype = csc.ftype
global_id = s.vars["global_id"]
local_id = s.vars["local_id"]
group_id = s.vars["group_id"]
global_size = s.vars["global_size"]
local_size = s.vars["local_size"]
num_groups = s.vars["num_groups"]
field_mesh_infos = {k: s.args[v] for (k, v) in self.field_mesh_infos.items()}
self.field_mesh_infos = field_mesh_infos
(
compute_grid_size,
loop_id,
vectorization_var,
max_extra_vwork_var,
local_work,
vzero,
voffset,
azero,
) = s._generate_common_variables()
(
array_gids,
array_vids,
array_values,
array_grid_sizes,
array_grid_ghosts,
array_local_data,
array_local_rdata,
array_private_data,
) = s._generate_array_variables()
event = CodegenVariable("evt", "event_t", tg, init="0")
with s._kernel_():
s.jumpline()
s.comment("Common kernel indices and sizes")
s.decl_aligned_vars(global_id, local_id, group_id, const=True)
s.decl_aligned_vars(global_size, local_size, num_groups, const=True)
s.comment("Common variables")
s.decl_aligned_vars(
compute_grid_size,
azero,
vzero,
voffset,
vectorization_var,
max_extra_vwork_var,
local_work,
)
s.comment("Array specific variables")
s.decl_aligned_vars(*tuple(array_grid_sizes.values()), const=True)
s.decl_aligned_vars(*tuple(array_grid_ghosts.values()), const=True)
s.comment("Global memory arrays")
s.decl_aligned_vars(
*(aij for ai in s.array_args.values() for aij in ai.values())
)
s.decl_aligned_vars(*csc.buffer_args.values())
s.comment("Local and private memory arrays")
s.decl_aligned_vars(
*(
aij
for ai in tuple(array_local_data.values())
+ tuple(array_local_rdata.values())
+ tuple(array_private_data.values())
for aij in filter(lambda x: x, ai.values())
)
)
s.comment("Iterating over array lines")
nested_loops = self._generate_loop_context()
with nested(*nested_loops):
s.load_data(event, local_id)
s.jumpline()
s.compute()
s.jumpline()
s.store_data(event, local_id)
# s.edit()
# s.test_compile()
# import sys
# sys.exit(1)
def compute(self):
s = self
s.comment("Compute expressions")
for fcall in self.fcalls:
fcall.fn_kwds["offset"] = self.csc.local_offset
if "dx" in fcall.fn_kwds:
mesh_info_0 = next(iter(self.field_mesh_infos.values()))
dx = mesh_info_0["dx"][0]
fcall.fn_kwds["dx"] = dx
with s._block_():
s.generate_expr_code()
def load_data(self, event, local_id):
s = self
csc = s.csc
di = csc.expr_info.discretization_info
has_local_loads = self.has_local_loads
has_private_loads = self.has_private_loads
has_local_right_cache = False
lid = local_id[0]
if not (has_local_loads or has_private_loads):
return
s.comment("Loading data from global memory.")
with s._block_():
if has_local_loads:
_ldata, _lrdata, _gdata, _ghosts = (), (), (), ()
for array in s.array_local_data:
local_data = s.array_local_data[array]
local_rdata = s.array_local_rdata[array]
global_data = s.array_line_data[array]
min_ghosts = csc.array_contiguous_ghosts[array]
read_counts = self.fmt_counter(di.read_counter.get(array, None))
if read_counts is None:
continue
for i in local_data:
if read_counts[i] == 0:
continue
_ldata += (local_data[i],)
_lrdata += (local_rdata[i],)
_gdata += (global_data[i],)
_ghosts += (min_ghosts[i],)
has_local_right_cache |= local_rdata[i] is not None
event.declare(s)
s.comment("Copy previously loaded data from right to left.")
with s._if_(f"!{csc.is_first}"):
with s._align_() as al:
for ldata, lrdata, gdata, ghosts in zip(
_ldata, _lrdata, _gdata, _ghosts
):
cond = f"{lid}<2*{ghosts}"
lhs = ldata[lid]
if lrdata:
rhs = lrdata[lid]
else:
rhs = ldata[f"{s.local_work}+{lid}"]
if csc.use_short_circuit:
code = f"({cond}) $&& ({lhs} $= {rhs}, true);"
else:
code = f"if( {cond} ) ${{ {lhs} $= {rhs}; }}"
al.append(code)
s.barrier(_local=True)
s.comment("Load right local memory from global memory.")
with s._if_(csc.is_first):
with s._align_() as al:
for ldata, lrdata, gdata, ghosts in zip(
_ldata, _lrdata, _gdata, _ghosts
):
src = f"{gdata}$-{ghosts}"
dst = ldata
num_elements = f"{s.current_local_work}$+2*{ghosts}"
code = s.async_work_group_copy(
dst, src, num_elements, event, align=True
)
al.append(code)
with s._else_():
with s._align_() as al:
for ldata, lrdata, gdata, ghosts in zip(
_ldata, _lrdata, _gdata, _ghosts
):
src = f"{gdata}$+{ghosts}"
dst = f"{ldata}$+2*{ghosts}"
num_elements = f"{s.current_local_work}"
code = s.async_work_group_copy(
dst, src, num_elements, event, align=True
)
al.append(code)
if has_private_loads:
s.comment("Load private data from global memory")
ptrs, dsts, default_vals = (), (), ()
for array in s.array_private_data:
private_data = s.array_private_data[array]
global_data = s.array_line_data[array]
read_counts = self.fmt_counter(di.read_counter.get(array, None))
if read_counts is None:
continue
for i in private_data:
if read_counts[i] == 0:
continue
dval = CustomSymbolicFunction.default_out_of_bounds_value(
ctype_to_dtype(global_data[i].ctype)
)
dsts += (private_data[i],)
ptrs += (global_data[i],)
default_vals += (dval,)
cond = "({fo}+{i} >= 0) && ({fo}+{i} < {})"
fcond = lambda i: cond.format(
csc.compute_grid_size[0], fo=csc.full_offset, i=i
)
s.multi_vload_if(
csc.is_active_boundary,
fcond,
csc.vectorization,
csc.local_offset,
ptrs,
dsts,
default_vals,
use_short_circuit=csc.use_short_circuit,
else_cond=csc.is_active,
)
if has_local_loads:
s.comment("Wait for local memory transactions to finish")
code = s.wait_group_events(1, f"&{event}")
s.append(code)
if has_local_right_cache:
s.comment("Copy right loaded local data for read-write arrays.")
with s._if_(f"!{csc.is_last}"):
with s._align_() as al:
for ldata, lrdata, gdata, ghosts in zip(
_ldata, _lrdata, _gdata, _ghosts
):
if lrdata is None:
continue
cond = f"{lid}<2*{ghosts}"
lhs = lrdata[lid]
rhs = ldata[f"{s.local_work}+{lid}"]
if csc.use_short_circuit:
code = f"({cond}) $&& ({lhs} $= {rhs}, true);"
else:
code = f"if( {cond} ) ${{ {lhs} $= {rhs}; }}"
al.append(code)
if has_local_loads or has_local_right_cache:
s.barrier(_local=True)
def store_data(self, event, local_id):
s = self
csc = s.csc
di = csc.expr_info.discretization_info
has_local_stores = self.has_local_stores
has_private_stores = self.has_private_stores
lid = local_id[0]
if not (has_local_stores or has_private_stores):
return
s.comment("Loading data back to global memory.")
with s._block_():
if has_local_stores:
_ldata, _lrdata, _gdata, _ghosts = (), (), (), ()
for array in s.array_local_data:
local_data = s.array_local_data[array]
local_rdata = s.array_local_rdata[array]
global_data = s.array_line_data[array]
min_ghosts = csc.array_contiguous_ghosts[array]
write_counts = self.fmt_counter(di.write_counter.get(array, None))
if write_counts is None:
continue
for i in local_data:
if write_counts[i] == 0:
continue
_ldata += (local_data[i],)
_lrdata += (local_rdata[i],)
_gdata += (global_data[i],)
_ghosts += (min_ghosts[i],)
event.declare(s)
s.comment("Load local memory to global memory.")
with s._align_() as al:
for ldata, lrdata, gdata, ghosts in zip(
_ldata, _lrdata, _gdata, _ghosts
):
dst = f"{gdata}"
src = f"{ldata}+{ghosts}"
num_elements = f"{s.current_local_work}"
code = s.async_work_group_copy(
dst, src, num_elements, event, align=True
)
al.append(code)
if has_private_stores:
s.comment("Load private data to global memory")
ptrs, srcs = (), ()
for array in s.array_private_data:
private_data = s.array_private_data[array]
global_data = s.array_line_data[array]
write_counts = self.fmt_counter(di.write_counter.get(array, None))
if write_counts is None:
continue
for i in private_data:
if write_counts[i] == 0:
continue
srcs += (private_data[i],)
ptrs += (global_data[i],)
cond = "({fo}+{i} >= 0) && ({fo}+{i} < {})"
fcond = lambda i: cond.format(
csc.compute_grid_size[0], fo=csc.full_offset, i=i
)
s.multi_vstore_if(
csc.is_active_boundary,
fcond,
csc.vectorization,
csc.local_offset,
srcs,
ptrs,
use_short_circuit=csc.use_short_circuit,
else_cond=csc.is_active,
)
if has_local_stores:
s.comment("Wait for local memory transactions to finish")
code = s.wait_group_events(1, f"&{event}")
s.append(code)
s.barrier(_local=True)
def fmt_counter(self, count):
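        """Normalize an access counter: a plain integer count (simple array)
        becomes {0: count}; per-component containers (or None) are returned
        unchanged."""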
if isinstance(count, int):
return {0: count}
else:
return count